In [1]:
    
import pandas as pd
    
In [2]:
    
# Load the URL-expansion results. The file is tab-separated and is read without a
# header row, so pandas assigns default integer column labels.
df = pd.read_csv("url_expanded.full.txt", sep="\t", header=None)
# Display (rows, columns) as the cell output.
df.shape
    
    Out[2]:
(97512, 3)
In [3]:
    
# Preview the first rows while the columns still carry their default integer labels.
df.head()
    
    Out[3]:
  
    
       
      0 
      1 
      2 
     
  
  
    
      0 
      http://www.investmentnews.com/article/20160801... 
      http://www.investmentnews.com/article/20160801... 
      0 
     
    
      1 
      http://ow.ly/3avNPe 
      https://www.reddit.com/r/cahideas/comments/42i... 
      0 
     
    
      2 
      http://stratcom.kma-assc.com/uncategorized/pre... 
      http://stratcom.kma-assc.com/uncategorized/pre... 
      3 
     
    
      3 
      http://ln.is/mabelsaveforschool.com/gbEtv 
      http://linkis.com/mabelsaveforschool.com/gbEtv 
      0 
     
    
      4 
      http://kiw.im/16LfJirkfzE 
      https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 
      0 
     
  
In [4]:
    
# Give the three positional columns descriptive names: the original URL, the URL it
# expanded to, and a numeric status code for the expansion.
# NOTE(review): the meaning of each status value is not documented in this notebook.
df.columns = ["URL", "EXPANDED", "EXPANDED_STATUS"]
df.head()
    
    Out[4]:
  
    
       
      URL 
      EXPANDED 
      EXPANDED_STATUS 
     
  
  
    
      0 
      http://www.investmentnews.com/article/20160801... 
      http://www.investmentnews.com/article/20160801... 
      0 
     
    
      1 
      http://ow.ly/3avNPe 
      https://www.reddit.com/r/cahideas/comments/42i... 
      0 
     
    
      2 
      http://stratcom.kma-assc.com/uncategorized/pre... 
      http://stratcom.kma-assc.com/uncategorized/pre... 
      3 
     
    
      3 
      http://ln.is/mabelsaveforschool.com/gbEtv 
      http://linkis.com/mabelsaveforschool.com/gbEtv 
      0 
     
    
      4 
      http://kiw.im/16LfJirkfzE 
      https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 
      0 
     
  
In [5]:
    
# Tally how many rows fall under each expansion status code.
df["EXPANDED_STATUS"].value_counts()
    
    Out[5]:
0    92362
1     3651
3     1489
2       10
Name: EXPANDED_STATUS, dtype: int64
In [6]:
    
# Look at a few of the rows whose expansion status code is 1.
df.loc[df["EXPANDED_STATUS"].eq(1)].head()
    
    Out[6]:
  
    
       
      URL 
      EXPANDED 
      EXPANDED_STATUS 
     
  
  
    
      14 
      http://dailydose.topratedviral.com/article/wom... 
      http://dailydose.topratedviral.com/article/wom... 
      1 
     
    
      15 
      http://gvwy.io/v9h3w9l 
      http://mabelsaveforschool.com/contest-entry 
      1 
     
    
      23 
      http://s.einnews.com/tGmrKnfQ1C 
      http://s.einnews.com/tGmrKnfQ1C 
      1 
     
    
      30 
      http://gvwy.io/lygewah 
      http://mabelsaveforschool.com/contest-entry 
      1 
     
    
      59 
      http://gvwy.io/3ogfrpp 
      http://mabelsaveforschool.com/contest-entry 
      1 
     
  
In [7]:
    
# Inspect the first 100 rows whose expansion status code is 3.
df.loc[df["EXPANDED_STATUS"].eq(3)].head(100)
    
    Out[7]:
  
    
       
      URL 
      EXPANDED 
      EXPANDED_STATUS 
     
  
  
    
      2 
      http://stratcom.kma-assc.com/uncategorized/pre... 
      http://stratcom.kma-assc.com/uncategorized/pre... 
      3 
     
    
      27 
      http://dlvr.it/KxCjYs 
      http://post/142016360553?utm_source=dlvr.it&ut... 
      3 
     
    
      64 
      http://soco.space/-m0zuC 
      http://soco.space/-m0zuC 
      3 
     
    
      120 
      http://seusnews.com/?p=1756 
      http://seusnews.com/?p=1756 
      3 
     
    
      145 
      http://deals.buycheap2day.com/US/lndng-st/twt/... 
      http://deals.buycheap2day.com/US/lndng-st/twt/... 
      3 
     
    
      240 
      http://sociably.me/L0pQLX 
      http://sociably.me/L0pQLX 
      3 
     
    
      257 
      http://j.mp/1Zyj2Lp 
      http://feeds.huffingtonpost.com/c/35496/f/6770... 
      3 
     
    
      536 
      https://videotube.livehost.fr/2016/11/28/learn... 
      https://videotube.livehost.fr/2016/11/28/learn... 
      3 
     
    
      548 
      http://dlvr.it/Kv4tfx 
      http://vulture.feedsportal.com/c/35348/f/66160... 
      3 
     
    
      560 
      http://www.fashionisme.us/2013/07/useful-foods... 
      http://www.fashionisme.us/2013/07/useful-foods... 
      3 
     
    
      755 
      http://www.relevantmagazine.com/reject-apathy/... 
      https://relevantmagazine.com//reject-apathy/vi... 
      3 
     
    
      829 
      http://dlvr.it/L1ZVtd 
      http://reuters.us.feedsportal.com/c/35217/f/65... 
      3 
     
    
      906 
      http://Blaylock.Dr.Group 
      http://Blaylock.Dr.Group/ 
      3 
     
    
      1240 
      http://reference-and-education.journaleus.com/... 
      http://reference-and-education.journaleus.com/... 
      3 
     
    
      1335 
      http://bit.ly/1UFhfkJ 
      http://reuters.us.feedsportal.com/c/35217/f/65... 
      3 
     
    
      1365 
      http://observer.gm/vaccination-campaign-agains... 
      http://observer.gm/vaccination-campaign-agains... 
      3 
     
    
      1455 
      http://www.Feed24hNews.com/7q9It 
      http://www.Feed24hNews.com/7q9It 
      3 
     
    
      1582 
      https://shar.es/1CMQ2r 
      http://www.authornicolewalker.com/raising_auti... 
      3 
     
    
      1626 
      http://inbrief.media/2016/01/20/when-can-the-f... 
      http://inbrief.media/2016/01/20/when-can-the-f... 
      3 
     
    
      1710 
      http://goo.gl/y7FzIj 
      http://www.dainikbhaskar.tv/apple/aclu-other-p... 
      3 
     
    
      1769 
      http://roots.ly/lHTptg 
      https://roots.ly/lHTptg 
      3 
     
    
      1836 
      http://dlvr.it/MLBH4l 
      http://master-of-education.goitstar.com/post/1... 
      3 
     
    
      1894 
      http://dlvr.it/MDn1j4 
      http://keepliberty.org/2016/09/10/us-has-spent... 
      3 
     
    
      1919 
      http://bit.ly/ONymEF 
      http://www.longbeachnsw.com.au/ 
      3 
     
    
      1953 
      http://fb.me/2S1kcWQTE 
      http://soco.space/qlXx9e 
      3 
     
    
      2089 
      http://ow.ly/3bs6KP 
      http://reuters.us.feedsportal.com/c/35217/f/65... 
      3 
     
    
      2256 
      http://bit.ly/21Ootr4 
      http://cnet.com.feedsportal.com/c/34938/f/6450... 
      3 
     
    
      2432 
      http://fb.me/3k28jNlpW 
      http://personalhealthdiary.co/fda-announce-tha... 
      3 
     
    
      2438 
      http://fb.me/3RERphXAT 
      http://personalhealthdiary.co/fda-announce-tha... 
      3 
     
    
      2582 
      http://goo.gl/fb/bpUe5M 
      http://post/138982303064?utm_source=feedburner... 
      3 
     
    
      ... 
      ... 
      ... 
      ... 
     
    
      4974 
      http://ow.ly/3a5TfG 
      http://viralinstant.com/no-adverse-side-effect... 
      3 
     
    
      4999 
      http://bit.ly/1TZDQdy 
      http://cnet.com.feedsportal.com/c/34938/f/6450... 
      3 
     
    
      5037 
      http://dlvr.it/KkRMV8 
      http://www.ynn.io/latest-best-sports-news/garb... 
      3 
     
    
      5229 
      http://bit.ly/1Y7rkt2 
      http://israelnewsreport.net/open-letter-on-don... 
      3 
     
    
      5249 
      http://dld.bz/erX9w 
      http://reuters.us.feedsportal.com/c/35217/f/65... 
      3 
     
    
      5295 
      http://salon.com.feedsportal.com/c/35105/f/648... 
      http://salon.com.feedsportal.com/c/35105/f/648... 
      3 
     
    
      5343 
      http://dlvr.it/L32mDB 
      http://rss.feedsportal.com/c/34793/f/641580/s/... 
      3 
     
    
      5370 
      http://soco.space/cM3jRw 
      http://soco.space/cM3jRw 
      3 
     
    
      5372 
      http://dlvr.it/LLBSXN 
      http://can10.jkmesh.com/post/144533385307?utm_... 
      3 
     
    
      5459 
      http://dlvr.it/DJy62M 
      http://www.techwens.com/blackberry-refutes-cla... 
      3 
     
    
      5460 
      http://smartbetty.me/1QT1SWV 
      http://smartbetty.me/1QT1SWV 
      3 
     
    
      5544 
      http://www.sciencedirect.com.libproxy-wb.imf.o... 
      http://www.sciencedirect.com.libproxy-wb.imf.o... 
      3 
     
    
      5584 
      http://ift.tt/28uIJ5f 
      http://bactema.com/orlando-terror-focus-alread... 
      3 
     
    
      5597 
      http://bit.ly/1VEZNzx 
      http://reuters.us.feedsportal.com/c/35217/f/65... 
      3 
     
    
      5739 
      http://live.americarisingpac.org/posts/212 
      http://live.americarisingpac.org/posts/212 
      3 
     
    
      5763 
      https://www.ataxia.org/pdf/SporadicAtaxias.pdf 
      https://www.ataxia.org/pdf/SporadicAtaxias.pdf 
      3 
     
    
      5831 
      http://bit.ly/1PCir7g 
      http://zdnet.com.feedsportal.com/c/35462/f/675... 
      3 
     
    
      5910 
      http://www.wholechildeducation.org/blog/build-... 
      http://www.wholechildeducation.org/blog/build-... 
      3 
     
    
      5993 
      http://www.alltriberr.com/poll-finds-national-... 
      http://www.alltriberr.com/poll-finds-national-... 
      3 
     
    
      6026 
      http://TheBlogToday.co.vu/q0TRBI 
      http://TheBlogToday.co.vu/q0TRBI 
      3 
     
    
      6043 
      http://goo.gl/Hd5WtH 
      http://casyope.info/5440903-13803496 
      3 
     
    
      6074 
      http://bit.ly/1ITFKdd 
      http://reuters.us.feedsportal.com/c/35217/f/65... 
      3 
     
    
      6085 
      http://dlvr.it/DFWbxR 
      http://zerohedge.feedsportal.com/c/34894/f/645... 
      3 
     
    
      6089 
      http://bit.ly/2aqzM87 
      http://beautifulhairstyles.14p.in/block-the-su... 
      3 
     
    
      6093 
      http://nyv.me/l/LJkg 
      http://nyv.me/l/LJkg 
      3 
     
    
      6186 
      http://ift.tt/1Yu6Pq0 
      http://nydailynews.com.feedsportal.com/c/34148... 
      3 
     
    
      6239 
      http://huff.to/1PBu5NG 
      http://feeds.huffingtonpost.com/c/35496/f/6770... 
      3 
     
    
      6242 
      http://www.PittsburghFor.me 
      http://www.PittsburghFor.me/ 
      3 
     
    
      6250 
      http://bit.ly/RentShelby 
      http://ourridelife.com/2016/03/23/you-can-rent... 
      3 
     
    
      6363 
      http://dlvr.it/L1bvpM 
      http://reuters.us.feedsportal.com/c/35217/f/65... 
      3 
     
  
100 rows × 3 columns
In [8]:
    
# Count which hosts the status-3 rows expanded to. Splitting "scheme://host/path" on
# "/" leaves the host at position 2.
# The vectorised .str[2] accessor returns NaN for values that are missing or have
# fewer than three pieces (value_counts then skips them), whereas the previous
# apply(lambda x: x[2]) raised IndexError / TypeError on such rows.
(
    df.loc[df["EXPANDED_STATUS"] == 3, "EXPANDED"]
    .str.split("/")
    .str[2]
    .value_counts()
)
    
    Out[8]:
reuters.us.feedsportal.com                   143
feeds.huffingtonpost.com                      92
soco.space                                    63
personalhealthdiary.co                        53
www.ynn.io                                    45
rss.feedsportal.com                           30
post                                          19
l.herald.ly                                   17
www.trendgizmo.com                            17
cnet.com.feedsportal.com                      16
zerohedge.feedsportal.com                     15
pumpkin-dukan-diet.7legend.net                15
zdnet.com.feedsportal.com                     15
healthlogics.press                            15
nydailynews.com.feedsportal.com               14
appleinsider.com.feedsportal.com              14
dailyeeuu.tusueldo.com                        13
politics.tusueldo.com                         13
telegraph.feedsportal.com                     13
www.techwens.com                              12
advertising-education.live-newsx.com          12
www.youthsnews.com                            12
master-of-education.goitstar.com              12
stratcom.kma-assc.com                         12
ndtv.com.feedsportal.com                      11
www.dainikbhaskar.tv                          11
100-singalong-songs-for-kids.goitstar.com     10
sociably.me                                   10
TheBlogToday.co.vu                            10
gbr.jkmesh.com                                10
                                            ... 
overdrive.ae                                   1
basketballfanzone.org                          1
csnn.gov                                       1
mrtopstep.com.                                 1
nipple-huggers.com                             1
www.peerlyst.com                               1
messagwww.mharrell3.myrandf.biz                1
www.acscva.com                                 1
israelnewsreport.net                           1
newsinkansas.ml                                1
acclaimcollegecounseling.com                   1
www.HoustonFor.me                              1
got-tlc.info                                   1
amazinggiftsforall.top                         1
carsautosglobal.com                            1
www.highplainsdailynews.com                    1
www.houstonlocal.news                          1
donaldtrumpreviews.com                         1
www.usworldreport.com                          1
lfger.com                                      1
albania.jobs.forjobsearch.com                  1
forum.theworldnewsmedia.org                    1
suavebaes.com                                  1
www.ihealthbeat.org                            1
bestoflisticles.com                            1
jobsearch.com.de                               1
inbrief.media                                  1
1.black                                        1
roots.ly                                       1
rockymountainrv.com                            1
Name: EXPANDED, dtype: int64
In [9]:
    
# Show status-3 rows whose expanded URL points at www.huffingtonpost.com.
# The host is position 2 of the "/"-split URL; .str[2] yields NaN (which compares
# unequal to the host string) for missing or malformed values instead of raising the
# way apply(lambda x: x[2]) did.
# NOTE(review): the recorded output of this cell is empty, i.e. no status-3 row has
# exactly this host — confirm whether "feeds.huffingtonpost.com" was the intended target.
df[
    (df["EXPANDED_STATUS"] == 3)
    & (df["EXPANDED"].str.split("/").str[2] == "www.huffingtonpost.com")
].head()
    
    Out[9]:
  
    
       
      URL 
      EXPANDED 
      EXPANDED_STATUS 
     
  
  
  
In [10]:
    
# Load a second tab-separated, header-less results file into its own frame.
# NOTE(review): the file name suggests a re-run over previously failing URLs — confirm.
df_err = pd.read_csv("url_expanded.error.1.txt", sep="\t", header=None)
# Display (rows, columns) as the cell output.
df_err.shape
    
    Out[10]:
(1489, 3)
In [11]:
    
# Use the same column names as the main frame so the two can be aligned later.
df_err.columns = ["URL", "EXPANDED", "EXPANDED_STATUS"]
df_err.head()
    
    Out[11]:
  
    
       
      URL 
      EXPANDED 
      EXPANDED_STATUS 
     
  
  
    
      0 
      http://ift.tt/1mBLaPF 
      http://reuters.us.feedsportal.com/c/35217/f/65... 
      3 
     
    
      1 
      http://logs.wsj.com/pharmalot/2015/06/08/merck... 
      http://logs.wsj.com/pharmalot/2015/06/08/merck... 
      3 
     
    
      2 
      http://bit.ly/1oRL1bE 
      http://rss.feedsportal.com/c/34793/f/641580/s/... 
      3 
     
    
      3 
      http://stratcom.kma-assc.com/uncategorized/pre... 
      http://stratcom.kma-assc.com/uncategorized/pre... 
      3 
     
    
      4 
      http://americagunban.com/moscow-says-usa-actio... 
      http://americagunban.com/moscow-says-usa-actio... 
      0 
     
  
In [12]:
    
# Tally the expansion status codes in the second results file.
df_err["EXPANDED_STATUS"].value_counts()
    
    Out[12]:
3    1396
0      71
1      22
Name: EXPANDED_STATUS, dtype: int64
In [13]:
    
# Count which hosts the status-3 rows of df_err expanded to. Splitting
# "scheme://host/path" on "/" leaves the host at position 2.
# The vectorised .str[2] accessor returns NaN for values that are missing or have
# fewer than three pieces (value_counts then skips them), whereas the previous
# apply(lambda x: x[2]) raised IndexError / TypeError on such rows.
(
    df_err.loc[df_err["EXPANDED_STATUS"] == 3, "EXPANDED"]
    .str.split("/")
    .str[2]
    .value_counts()
)
    
    Out[13]:
reuters.us.feedsportal.com                   143
feeds.huffingtonpost.com                      92
soco.space                                    63
personalhealthdiary.co                        53
www.ynn.io                                    45
rss.feedsportal.com                           30
l.herald.ly                                   17
www.trendgizmo.com                            17
cnet.com.feedsportal.com                      16
zdnet.com.feedsportal.com                     15
pumpkin-dukan-diet.7legend.net                15
zerohedge.feedsportal.com                     15
appleinsider.com.feedsportal.com              14
nydailynews.com.feedsportal.com               14
politics.tusueldo.com                         13
telegraph.feedsportal.com                     13
dailyeeuu.tusueldo.com                        13
advertising-education.live-newsx.com          12
master-of-education.goitstar.com              12
www.techwens.com                              12
www.youthsnews.com                            12
stratcom.kma-assc.com                         12
www.dainikbhaskar.tv                          11
ndtv.com.feedsportal.com                      11
sociably.me                                   10
TheBlogToday.co.vu                            10
gbr.jkmesh.com                                10
dailyhobbies.net                              10
100-singalong-songs-for-kids.goitstar.com     10
nu.hackn.us                                    9
                                            ... 
www.samgotechnology.com                        1
Benghazi.You                                   1
www.meltsalad.com                              1
livehealthy-team.com                           1
techdaily.xyz                                  1
post                                           1
furbabypetpalace.com                           1
www.gadgets-4g.com                             1
cybermick.com                                  1
amazinggiftsforall.top                         1
COUNTRY.How                                    1
newsinkansas.ml                                1
www.houstonlocal.news                          1
lfger.com                                      1
albania.jobs.forjobsearch.com                  1
forum.theworldnewsmedia.org                    1
suavebaes.com                                  1
crenshaw.house.gov                             1
observer.gm                                    1
bestoflisticles.com                            1
Stranger.com                                   1
jobsearch.com.de                               1
orlando                                        1
1.black                                        1
carsautosglobal.com                            1
www.acscva.com                                 1
www.lidoautobody.com                           1
messagwww.mharrell3.myrandf.biz                1
www.thechildcaresquare.com                     1
rockymountainrv.com                            1
Name: EXPANDED, dtype: int64
In [14]:
    
# Re-key both frames by the original URL; set_index returns new frames, which are
# bound back to the same names. The resulting shapes are shown as the cell output.
df, df_err = (frame.set_index("URL") for frame in (df, df_err))
df.shape, df_err.shape
    
    Out[14]:
((97512, 2), (1489, 2))
In [15]:
    
# Preview the frame now that URL is the index rather than a regular column.
df.head()
    
    Out[15]:
  
    
       
      EXPANDED 
      EXPANDED_STATUS 
     
    
      URL 
       
       
     
  
  
    
      http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike 
      http://www.investmentnews.com/article/20160801... 
      0 
     
    
      http://ow.ly/3avNPe 
      https://www.reddit.com/r/cahideas/comments/42i... 
      0 
     
    
      http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/ 
      http://stratcom.kma-assc.com/uncategorized/pre... 
      3 
     
    
      http://ln.is/mabelsaveforschool.com/gbEtv 
      http://linkis.com/mabelsaveforschool.com/gbEtv 
      0 
     
    
      http://kiw.im/16LfJirkfzE 
      https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 
      0 
     
  
In [16]:
    
# Overwrite the rows of df whose URL also appears in df_err with df_err's
# EXPANDED / EXPANDED_STATUS values. Both frames are indexed by URL, and pandas aligns
# the right-hand frame on index and column labels during the assignment.
# .loc replaces the .ix indexer, which was deprecated in pandas 0.20 and removed in
# pandas 1.0; with a string index .ix was label-based, so .loc is the direct equivalent.
# NOTE(review): assumes every URL in df_err exists in df and is unique there — confirm.
df.loc[df_err.index, ["EXPANDED", "EXPANDED_STATUS"]] = df_err[["EXPANDED", "EXPANDED_STATUS"]]
    
In [17]:
    
# Sanity check: the status distribution of the overwritten rows in df should now match
# the distribution seen in df_err.
# Uses a single .loc[rows, column] lookup instead of the removed .ix indexer followed
# by chained [] indexing.
df.loc[df_err.index, "EXPANDED_STATUS"].value_counts()
    
    Out[17]:
3    1396
0      71
1      22
Name: EXPANDED_STATUS, dtype: int64
In [18]:
    
# Persist the merged frame as a tab-separated file; the URL index is written as the
# first column, with a header row.
df.to_csv("url_expanded.merged.txt", sep="\t")
# IPython shell escape: print the first lines of the written file as a quick check.
! head url_expanded.merged.txt
    
    
URL	EXPANDED	EXPANDED_STATUS
http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike	http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike	0
http://ow.ly/3avNPe	https://www.reddit.com/r/cahideas/comments/42i3ew/w_farting_mid_rimjob/	0
http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/	http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/	3
http://ln.is/mabelsaveforschool.com/gbEtv	http://linkis.com/mabelsaveforschool.com/gbEtv	0
http://kiw.im/16LfJirkfzE	https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927	0
http://fb.me/241s7UtEJ	https://www.facebook.com/story.php?story_fbid=1251035921618693&id=100001368900242	0
http://owl.li/XkyUO	https://www.youtube.com/watch?v=xtspq5T7B44&feature=em-uploademail	0
http://goo.gl/RTQ29	http://localbuzznetwork.com/clarksburg-wv-job-search/	0
http://buff.ly/1SNoZU6	http://weightlosslaw.com/01cdea672dbfe8?utm_content=bufferb9ed1&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer	0
In [ ]:
    
    
Content source: napsternxg/ControversialTweetAnalysis
Similar notebooks: